# RayService that serves the RayLLM router application.
apiVersion: ray.io/v1alpha1
kind: RayService
metadata:
  name: rayllm
spec:
  # Health-check threshold for the service, in seconds.
  # The original note gives 60 as the default; confirm for your KubeRay version.
  serviceUnhealthySecondThreshold: 1200
  # Health-check threshold for deployments, in seconds.
  # The original note gives 60 as the default; confirm for your KubeRay version.
  deploymentUnhealthySecondThreshold: 1200
  # Serve configuration, embedded as a multi-line string.
  serveConfigV2: |
    applications:
    - name: router
      import_path: rayllm.backend:router_application
      route_prefix: /
      args:
        models:
          - ./models/continuous_batching/amazon--LightGPT.yaml
  rayClusterConfig:
    # Head pod definition.
    headGroupSpec:
      # `rayStartParams` configure the `ray start` command for the head.
      # KubeRay defaults:
      #   https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md
      # All available options:
      #   https://docs.ray.io/en/latest/cluster/cli.html#ray-start
      rayStartParams:
        dashboard-host: '0.0.0.0'
        # Custom resource string passed to `ray start` (JSON, escaped and double-wrapped in quotes).
        resources: '"{\"accelerator_type_cpu\": 2}"'
      # Pod template for the head.
      template:
        spec:
          containers:
          - name: ray-head
            image: anyscale/ray-llm:latest
            # Requests and limits are set to the same values.
            resources:
              requests:
                cpu: 2
                memory: 8Gi
              limits:
                cpu: 2
                memory: 8Gi
            ports:
            - name: gcs-server
              containerPort: 6379
            - name: dashboard # Ray dashboard
              containerPort: 8265
            - name: client
              containerPort: 10001
            - name: serve
              containerPort: 8000
    workerGroupSpecs:
    # Worker group "gpu-group": `groupName` is a logical label and may describe the group's function.
    - groupName: gpu-group
      # Number of worker pods in this group, with its lower and upper bounds.
      replicas: 1
      minReplicas: 0
      maxReplicas: 1
      rayStartParams:
        # Custom resource string passed to `ray start` (JSON, escaped and double-wrapped in quotes).
        # NOTE(review): the names advertise a10 / a100_80g accelerators while the nodeSelector
        # below targets nvidia-l4-vws -- confirm this labelling is intentional.
        resources: '"{\"accelerator_type_cpu\": 48, \"accelerator_type_a10\": 2, \"accelerator_type_a100_80g\": 2}"'
      # Pod template for the workers.
      template:
        spec:
          # Restrict scheduling to nodes carrying this accelerator label.
          nodeSelector:
            cloud.google.com/gke-accelerator: nvidia-l4-vws
          # Tolerates the taint ray.io/node-type=worker:NoSchedule.
          # Please add that taint to the GPU nodes.
          tolerations:
          - key: "ray.io/node-type"
            operator: "Equal"
            value: "worker"
            effect: "NoSchedule"
          containers:
          - name: llm
            image: anyscale/ray-llm:latest
            # Run `ray stop` before the container is terminated.
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"]
            # Requests are lower than limits for cpu and memory; the GPU count is the same in both.
            resources:
              requests:
                cpu: "36"
                memory: "128G"
                nvidia.com/gpu: 4
              limits:
                cpu: "48"
                memory: "192G"
                nvidia.com/gpu: 4
            ports:
            - name: serve
              containerPort: 8000
